Mineria de datos - Proyecto Final
Librerías
library(tidyverse)
library(rmdformats)
library(stats)
library(cluster)
library(mclust)
library(factoextra)
library(dendextend)
library(DT)
library(purrr)
library(igraph)
library(tidygraph)
library(ggraph)
library(ggpubr)
library(clustertend)
library(fpc)
library(FactoMineR)
library(factoextra)
library(pvclust)
library(cluster.datasets)
library(mltools)
library(data.table)Datos
file <- "/Users/antony.vargasulead.ac.cr/Mineria de datos/Proyecto Final/Data/Heart.csv"
df <- read.csv(file, sep = ",", dec = ".")
df <- df[,!(names(df) %in% "target")]
DT::datatable(df)2. Análisis descriptivo
2.1 Datos cualitativos
df_cualitativo <- df[, c("age", "trestbps", "chol", "thalach", "oldpeak")]
head(df_cualitativo)## age trestbps chol thalach oldpeak
## 1 52 125 212 168 1.0
## 2 53 140 203 155 3.1
## 3 70 145 174 125 2.6
## 4 61 148 203 161 0.0
## 5 62 138 294 106 1.9
## 6 58 100 248 122 1.0
2.2 Resumen
print("Resumen estadistico")## [1] "Resumen estadistico"
summary(df_cualitativo)## age trestbps chol thalach oldpeak
## Min. :29.00 Min. : 94.0 Min. :126 Min. : 71.0 Min. :0.000
## 1st Qu.:48.00 1st Qu.:120.0 1st Qu.:211 1st Qu.:132.0 1st Qu.:0.000
## Median :56.00 Median :130.0 Median :240 Median :152.0 Median :0.800
## Mean :54.43 Mean :131.6 Mean :246 Mean :149.1 Mean :1.072
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:275 3rd Qu.:166.0 3rd Qu.:1.800
## Max. :77.00 Max. :200.0 Max. :564 Max. :202.0 Max. :6.200
2.3 Desviaciones estándar
for (i in colnames(df_cualitativo)){
print(i)
print((sd(df[, i])))
}## [1] "age"
## [1] 9.07229
## [1] "trestbps"
## [1] 17.51672
## [1] "chol"
## [1] 51.59251
## [1] "thalach"
## [1] 23.00572
## [1] "oldpeak"
## [1] 1.175053
2.4 Cuartiles en diagrama de cajas
par(mfrow=c(2,3))
boxplot(df_cualitativo$age,
main = "Boxplot - Age",
ylab = "Age",
col = "orange",
border = "brown",
horizontal = FALSE,
notch = TRUE)
boxplot(df_cualitativo$trestbps,
main = "Boxplot - Trestbps",
ylab = "Trestbps",
col = "orange",
border = "brown",
horizontal = FALSE,
notch = TRUE)
boxplot(df_cualitativo$chol,
main = "Boxplot - Chol",
ylab = "Chol",
col = "orange",
border = "brown",
horizontal = FALSE,
notch = TRUE)
boxplot(df_cualitativo$thalach,
main = "Boxplot - Thalach",
ylab = "thalach",
col = "orange",
border = "brown",
horizontal = FALSE,
notch = TRUE)
boxplot(df_cualitativo$oldpeak,
main = "Boxplot - Oldpeak",
ylab = "Oldpeak",
col = "orange",
border = "brown",
horizontal = FALSE,
notch = TRUE)3. Análisis no Supervisado:
3.1 Análisis de componentes principales
PCA <- PCA(df, graph = FALSE, dim(df)[2])
PCA## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1025 individuals, described by 13 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
3.1.1 Tabla general
eig.tmp <- PCA$eig
eig.tmp[,2:3]<-eig.tmp[,2:3]/100.
DT::datatable(eig.tmp) %>%
formatRound('eigenvalue',2) %>%
formatPercentage(c('percentage of variance','cumulative percentage of variance'),2)3.1.2 Gráfico de sedimentación
ggplot(data = data.frame(prop_varianza_acum = PCA$eig[,3], pc = 1:dim(PCA$eig)[1]),
aes(x = pc, y = prop_varianza_acum, group = 1)) +
geom_point() +
geom_line() +
theme_bw() +
labs(x = "Componente principal",
y = "Prop. varianza explicada acumulada")3.1.3 Tabla de cosenos cuadrados - individuos
DT::datatable(PCA$ind$cos2) %>%
formatPercentage(colnames(PCA$ind$cos2),2)3.1.4 Tabla de contribuciones - individuos
DT::datatable(PCA$ind$contrib) %>%
formatRound(colnames(PCA$ind$contrib),3)3.1.5 Tabla de cosenos cuadrados - variables
DT::datatable(PCA$var$cos2) %>%
formatPercentage(colnames(PCA$var$cos2),2)3.1.6 Tabla de contribuciones - variables
DT::datatable(PCA$var$contrib) %>%
formatRound(colnames(PCA$var$contrib),3)3.1.7 Plano principal - Cosenos cuadrados de individuos
fviz_pca_ind(PCA, col.ind="cos2", select.ind = list(cos2 = 0.60), geom = "point",
gradient.cols = c("black", "#2E9FDF", "#FC4E07" ), title = "Cosenos cuadrados - individuos")3.1.8 Plano principal - Contribución de individuos
fviz_pca_ind(PCA, col.ind="contrib", geom = "point",
gradient.cols = c("black", "#2E9FDF", "#FC4E07" ), title = "Ejemplo 1 Contribución",repel = TRUE)3.1.9 Cículo de correlación - Cosenos cuadrados individuos
fviz_pca_var(PCA, col.var = "cos2",
gradient.cols = c("black", "blue", "red"),
ggtheme = theme_minimal())3.1.10 Cículo de correlación - Contribución individuos
fviz_pca_var(PCA, col.var = "contrib",
gradient.cols = c("black", "blue", "red"),
ggtheme = theme_minimal()) ### 3.1.11 Correlación entre variables originales y los componentes principales
library(corrplot)## corrplot 0.90 loaded
corrplot(PCA$var$cor)3.2 Análisis de correspondencia simple
3.2.1 Aplicación del Análisis de correspondencia simple (ACS)
ACS <- CA(df, graph = TRUE, dim(df)[2])3.2.2 Valores propios - inercia explicada
fviz_eig(ACS, linecolor = "#FC4E07",
barcolor = "#00AFBB", barfill = "#00AFBB")3.2.3 Plano principal - Cosenos cuadrados de individuos
fviz_ca_row(ACS, select.row = list(cos2 = 0.60), col.row = "cos2", geom = "point",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)3.2.4 Plano principal - Cosenos cuadrado de variables
fviz_ca_col(ACS, col.col = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))3.2.4 Gráfico de sobreposición
fviz_ca_biplot(ACS, select.row = list(cos2 = 0.80), repel = TRUE)## Warning: ggrepel: 150 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
3.4 Sentido de la clusterización en el problema
set.seed(321)
df_scale <- scale(df, center = TRUE, scale = TRUE)
hopkins(data = df_scale, n = nrow(df_scale) - 1)## $H
## [1] 0.3234038
No tiene mucho sentido la clusterización de este problema, ya que el índice de Hopkins da un resultado de 32.3%, superior al 25%, el cual indica que no tiene mucho sentido realizar la clusterización. Un resultado cercano al 50% indica que del todo no se puede clusterizar y el índice se hacer a ese porcentaje
3.5 Clusterización
3.5.1 Kmeans
km.datos <- kmeans(x = df_scale, centers = 5)
p1 <- fviz_cluster(object = km.datos, data = df_scale,
ellipse.type = "norm", geom = "point", main = "Datos iris",
stand = FALSE, palette = "jco") +
theme_bw() + theme(legend.position = "none")
p13.5.2 Clustering jerárquico
hc <- hclust(d = dist(x = df_scale, method = "euclidean"),
method = "complete")
fviz_dend(x = hc, k = 5, cex = 0.6) +
labs(title = "Herarchical clustering",
subtitle = "Distancia euclídea, Lincage complete, K=2")## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
3.6 Número óptimo de k clúster
jambu <- fviz_nbclust(x = df_scale, FUNcluster = kmeans, method = "wss", k.max = 15,
diss = get_dist(df_scale, method = "euclidean"), nstart = 50)
jambuEs difícil determinar la cantidad k óptima de clúster, ya que en el Codo de Jambú se nota que la línea no se lográ estabilizarse, sin embargo a criterio, podemos determinar que el mejor número es k = 5, sin embargo no se sabe con certeza.
3.7 Indicadores para la evaluación de los clústeres
3.7.1 Visual Assessment of cluster Tendency (VAT)
dist_data <- dist(df_scale, method = "euclidean")
p2 <- fviz_dist(dist.obj = dist_data, show_labels = FALSE) +
labs(title = "Datos - Heart") + theme(legend.position = "bottom")
p2